# -*- coding: utf-8 -*-
'''
Download all 120 chapters of "Romance of the Three Kingdoms"
from shicimingju.com, using requests + BeautifulSoup (bs4).
'''
import re

import requests
from bs4 import BeautifulSoup

main_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
proxies = {"http": None, "https": None}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# 二进制获取再格式化去中文乱码
main_page = requests.get(url=main_url, proxies=proxies, headers=headers).content
html = str(main_page, 'utf-8')

# 实例化bs4对象并定位主页面的a标签
soup = BeautifulSoup(html, 'lxml')
main_location = soup.select('.book-mulu > ul > li > a')
#print(main_location)

for i in main_location:
    # 获取标签和详情页超链接
    title = i.string
    hyperlinks = 'https://www.shicimingju.com' + i['href']
    #print(title,hyperlinks)

    detail_page = requests.get(url=hyperlinks, proxies=proxies,headers=headers).content
    son_html = str(detail_page, 'utf-8')
    # 因为详情页内有很多br标签用于分段落，直接获取text会导致没有换行，所以先将页面内的br标签全部替换成\n
    new_html = son_html.replace('<br>', '\n')    # 网页就是一个字符串对象，可以使用replace替换

    soup = BeautifulSoup(new_html, 'lxml')
    detail_location = soup.find('div',class_='chapter_content').text

    with open('三国演义.txt', 'a+', encoding='utf-8') as f:
        f.write(title + '\n' + detail_location + '\n')
    print(title, '保存成功')